# Extract the content of each <text> node from a wiki XML dump,
# replace every character not in \w with a space,
# and convert the result to lowercase.

#!/usr/bin/perl

use strict;
use warnings;

# Require exactly one argument: the path of the XML dump to read.
# The usage text is raised through die so it lands on STDERR instead of
# mixing with the extracted output on STDOUT; the exit status stays non-zero.
if (@ARGV != 1) {
	die "usage: perl extract_text_node.pl FileName\n";
}

my $file = $ARGV[0];

# Three-argument open with a lexical handle: the file name can never be
# interpreted as a mode or a pipe, and $! reports why the open failed.
open my $fh, '<', $file or die "can't open file $file: $!";

# Lines of the <text> element currently being collected; empty while the
# reader is between elements.
my $text = "";

while (my $line = <$fh>) {
	# Nothing has to be remembered until an opening "<text" shows up, so
	# lines between elements are dropped instead of piling up in the buffer.
	next if $text eq "" && index($line, "<text") < 0;

	# Line endings are kept on purpose: they are turned into spaces below,
	# which keeps the last word of one line apart from the first word of
	# the next.
	$text .= $line;

	# A match always ends with "</text>", so the buffer only needs to be
	# searched when the line just appended contains a closing tag.
	next if index($line, "</text>") < 0;

	# The opening tag must not be self-closing (<text ... />): such a tag has
	# no content and no closing tag of its own, so treating it as an opener
	# would capture everything up to the next element's </text>.
	if ($text =~ m{<text[^>]*(?<!/)>(.*?)</text>}s) {
		my $content = $1;

		# Every non-word character, newlines included, becomes a space.
		$content =~ s/\W/ /g;

		# One lowercased line of output per extracted element.
		print lc($content), "\n";
		$text = "";
	}
}

close $fh;
